#include <iostream>
#include<sys/time.h>
#include<arm_neon.h>
#include<pthread.h>
#include <malloc.h>
#include <semaphore.h>
using namespace std;
// Number of worker threads created for each elimination step; also the row
// stride each worker uses when walking the rows below the pivot.
const int worker_count=7;
// Dimension of the square matrix.
const int N=4000;
// The N x N matrix, filled by m_reset() and reduced in place by main() and
// threadFunc(). Stored as a global (static storage, about 64 MB of floats).
float m[N][N];
/*
 * Fills the global matrix `m` with test data.
 *
 * Phase 1 builds a unit upper-triangular matrix: zeros below the diagonal,
 * 1 on the diagonal and pseudo-random integers in [1, 1000] above it
 * (drawn from rand() in row-major order).
 * Phase 2 adds every row into each of the rows that follow it, processing
 * the source rows from top to bottom, so the lower triangle is no longer zero.
 */
void m_reset()
{
    // Phase 1: unit upper-triangular matrix with random upper entries.
    for (int row = 0; row < N; ++row)
    {
        float *cur = m[row];
        int col = 0;
        while (col < row)
        {
            cur[col] = 0;
            ++col;
        }
        cur[col] = 1.0f;
        ++col;
        while (col < N)
        {
            cur[col] = rand() % 1000 + 1;
            ++col;
        }
    }

    // Phase 2: accumulate each (already updated) row into all later rows.
    for (int src = 0; src < N; ++src)
    {
        const float *from = m[src];
        for (int dst = src + 1; dst < N; ++dst)
        {
            float *to = m[dst];
            for (int col = 0; col < N; ++col)
            {
                to[col] += from[col];
            }
        }
    }
}
// Argument bundle handed to each worker thread through pthread_create().
struct threadParam_t
{
    int k;    // index of the pivot row/column for the current elimination step
    int t_id; // index of the worker among the threads created for that step
};


/*
 * Worker entry point for one elimination step.
 *
 * `param` points to a threadParam_t holding the pivot index `k` and this
 * worker's index `t_id`. The worker handles rows k+1+t_id, k+1+t_id+worker_count,
 * ... below the pivot. For each such row i it computes
 *     m[i][j] -= m[k][j] * m[i][k]   for j in (k, N)
 * four columns at a time with NEON, finishing the remainder with scalar code,
 * and then clears m[i][k]. The thread ends via pthread_exit(NULL).
 */
void *threadFunc(void *param)
{
    const threadParam_t *args = static_cast<threadParam_t *>(param);
    const int pivot = args->k;
    const float *pivotRow = m[pivot];

    for (int row = pivot + args->t_id + 1; row < N; row += worker_count)
    {
        float *target = m[row];
        // Broadcast the elimination factor m[row][pivot] to all four lanes.
        const float32x4_t factor = vmovq_n_f32(target[pivot]);

        int col = pivot + 1;
        // Vector part: four columns per iteration while a full group fits.
        while (col + 4 <= N)
        {
            const float32x4_t scaled = vmulq_f32(vld1q_f32(pivotRow + col), factor);
            vst1q_f32(target + col, vsubq_f32(vld1q_f32(target + col), scaled));
            col += 4;
        }
        // Scalar tail for the remaining (fewer than four) columns.
        while (col < N)
        {
            target[col] = target[col] - pivotRow[col] * target[pivot];
            ++col;
        }
        // The factor is only cleared after every column has used it.
        target[pivot] = 0;
    }
    pthread_exit(NULL);
}
/*
 * Benchmark driver: fills the global matrix with m_reset(), then reduces it
 * in place and prints the elapsed wall-clock time in microseconds.
 *
 * For every pivot k the main thread divides row k by m[k][k] (NEON for groups
 * of four columns, scalar for the rest), then creates worker_count threads
 * running threadFunc to eliminate column k from the rows below, and joins
 * them before moving on to the next pivot.
 *
 * Returns 0 on success, 1 if a worker thread could not be created.
 */
int main()
{
    m_reset();

    // Thread handles and arguments are reused for every pivot. Keeping them
    // on the stack avoids allocating (and leaking) two heap blocks per step.
    pthread_t handles[worker_count];
    threadParam_t param[worker_count];

    struct timeval tv_begin, tv_end;
    unsigned long dynamic_pthread_simd_time;

    gettimeofday(&tv_begin, NULL);
    for (int k = 0; k < N; k++)
    {
        // Normalise the pivot row: m[k][j] /= m[k][k] for j > k.
        const float32x4_t vt = vmovq_n_f32(m[k][k]);
        int j = k + 1;
        // `<=` lets the last complete group of four go through the vector
        // path, matching the loop bound used in threadFunc.
        for (; j + 4 <= N; j += 4)
        {
            float32x4_t va = vld1q_f32(&m[k][j]);
            va = vdivq_f32(va, vt);
            vst1q_f32(&m[k][j], va);
        }
        for (; j < N; j++)
        {
            m[k][j] = m[k][j] / m[k][k];
        }
        // The diagonal is set only after every division has read it.
        m[k][k] = 1.0;

        // Launch the workers for this pivot, stopping at the first failure.
        int started = 0;
        int create_err = 0;
        for (; started < worker_count; started++)
        {
            param[started].k = k;
            param[started].t_id = started;
            create_err = pthread_create(&handles[started], NULL, threadFunc, (void *)&param[started]);
            if (create_err != 0)
                break;
        }

        // Join only the threads that were actually created; all of them must
        // finish before the next pivot row is normalised.
        for (int t_id = 0; t_id < started; t_id++)
        {
            pthread_join(handles[t_id], NULL);
        }

        if (create_err != 0)
        {
            // Some rows for this pivot were never processed, so give up
            // rather than continue with a partly eliminated matrix.
            cerr << "pthread_create failed (error " << create_err << ") at step " << k << endl;
            return 1;
        }
    }
    gettimeofday(&tv_end, NULL);

    dynamic_pthread_simd_time = 1000000 * (tv_end.tv_sec - tv_begin.tv_sec) + tv_end.tv_usec - tv_begin.tv_usec;
    cout << "dynamic_pthread_simd_time:" << dynamic_pthread_simd_time << endl;

    return 0;
}